Benchmark model

We divide the entire dataset into two sets, i.e. a training set and a test set, in which the former constitutes 70% of the original data as a rule of thumb. Furthermore, stratified random sampling is employed — a method of sampling that involves the division of a population into smaller sub-groups known as strata. In this method of stratification, the strata are formed based on members' shared attributes. The reasons we use stratified sampling rather than simple random sampling are that (i) it gives a smaller error in estimation if measurements within strata have a lower standard deviation, and (ii) it preserves the empirical population proportion of the target variable.

# Load the raw churn data and drop columns that must not enter the model:
# the two pre-computed Naive Bayes classifier columns (target leakage from a
# previous model), the client identifier, and Avg_Open_To_Buy (presumably
# redundant with Credit_Limit / Total_Revolving_Bal — confirm).
df <- as_tibble(read.csv('BankChurners.csv', header = TRUE))
df <- df %>%
    dplyr::select(-Naive_Bayes_Classifier_Attrition_Flag_Card_Category_Contacts_Count_12_mon_Dependent_count_Education_Level_Months_Inactive_12_mon_1, -Naive_Bayes_Classifier_Attrition_Flag_Card_Category_Contacts_Count_12_mon_Dependent_count_Education_Level_Months_Inactive_12_mon_2, -CLIENTNUM, -Avg_Open_To_Buy)
# Encode every character column as a factor; across() replaces the
# superseded mutate_if().
df <- df %>%
    mutate(across(where(is.character), as.factor))

# Stratified 70/30 train-test split on the target so that both partitions
# keep the empirical class proportions of Attrition_Flag.
set.seed(123456)
split_idx <- caret::createDataPartition(df$Attrition_Flag, p = 0.7, list = FALSE)
train_dat <- df[split_idx, ]
test_dat <- df[-split_idx, ]
# Sanity check: the class proportions are (nearly) identical across the full
# data, the training set, and the test set.
prop.table(table(df$Attrition_Flag))
## 
## Attrited Customer Existing Customer 
##         0.1606596         0.8393404
prop.table(table(train_dat$Attrition_Flag))
## 
## Attrited Customer Existing Customer 
##         0.1606715         0.8393285
prop.table(table(test_dat$Attrition_Flag))
## 
## Attrited Customer Existing Customer 
##          0.160632          0.839368

The benchmark model is just a naive logistic regression without any other data-processing techniques. Here we employ stepwise selection to find the best feature combination; note that the penalty k = qchisq(0.95, 1) ≈ 3.84 used below makes each add/drop step equivalent to a 5% significance test, rather than plain AIC minimization (which would use k = 2).

# Benchmark: logistic regression on all predictors, then bidirectional
# stepwise selection. qchisq(0.95, 1) is identical to
# qchisq(0.05, 1, lower.tail = FALSE) ~ 3.84, i.e. a 5% significance
# threshold for each add/drop step.
logit_full <- glm(Attrition_Flag ~ .,
                  family = binomial(link = "logit"),
                  data = train_dat)
logit_stepwise <- stepAIC(logit_full,
                          k = qchisq(0.95, 1),
                          direction = "both",
                          trace = FALSE)
DT::datatable(tidy(logit_stepwise))

The benchmark model selects 11 variables, namely Gender, Dependent_count, Marital_Status, Total_Relationship_Count, Months_Inactive_12_mon, Contacts_Count_12_mon, Total_Revolving_Bal, Total_Amt_Chng_Q4_Q1, Total_Trans_Amt, Total_Trans_Ct and Total_Ct_Chng_Q4_Q1 as contributing factors for discrimination. All variables are significant at 5%. Also, no multicollinearity has been reported.

# Generalized VIF for the selected benchmark model. gvif^(1/(2*df)) stays
# near 1 for every term except the Total_Trans_Amt / Total_Trans_Ct pair
# (~2.1), which is treated as acceptable in the surrounding analysis.
vif(logit_stepwise, merge_coef = TRUE)
##                     variable     gvif df gvif^(1/(2*df))
##  1:                   Gender 1.045163  1        1.022332
##  2:          Dependent_count 1.019390  1        1.009649
##  3:           Marital_Status 1.064858  3        1.010529
##  4: Total_Relationship_Count 1.218709  1        1.103951
##  5:   Months_Inactive_12_mon 1.037043  1        1.018353
##  6:    Contacts_Count_12_mon 1.039609  1        1.019612
##  7:      Total_Revolving_Bal 1.048028  1        1.023733
##  8:     Total_Amt_Chng_Q4_Q1 1.158903  1        1.076524
##  9:          Total_Trans_Amt 4.337972  1        2.082780
## 10:           Total_Trans_Ct 4.561120  1        2.135678
## 11:      Total_Ct_Chng_Q4_Q1 1.191626  1        1.091616

SMOTE addon

The SMOTE technique relies on a K-nearest-neighbour search, so we first select the best K. The following result suggests K should be 25 at the elbow point.

# 10-fold cross-validation to tune k for a KNN classifier on the (centered
# and scaled) training data; the resulting elbow plot guides the choice of
# k for SMOTE below.
# NOTE(review): the original code built a tuneGrid (kmax / kernel / distance)
# that was never passed to train() — and those tuning parameters belong to
# method = "kknn", not method = "knn" — so it was dead code and has been
# removed. The search actually performed is tuneLength = 15 values of k.
set.seed(123)
ctrl <- trainControl(method = "cv", number = 10)
temp <- train_dat
knnFit <- train(Attrition_Flag ~ .,
                data = temp,
                method = 'knn',
                trControl = ctrl,
                preProcess = c('center', 'scale'),
                tuneLength = 15)
plot(knnFit)

Let’s recheck the data.

# Class counts before balancing: the attrited class is heavily outnumbered.
table(train_dat$Attrition_Flag)
## 
## Attrited Customer Existing Customer 
##              1139              5950
# SMOTE with perc.over = 100 doubles the minority class (1139 -> 2278) and,
# with the default undersampling, brings the majority class down to match;
# k = 25 follows the elbow point found above.
balanced.data <- SMOTE(Attrition_Flag ~ .,
                       as.data.frame(train_dat),
                       perc.over = 100,
                       k = 25)
table(balanced.data$Attrition_Flag)
## 
## Attrited Customer Existing Customer 
##              2278              2278

Adding SMOTE on benchmark model.

# Re-fit the stepwise logistic regression on the SMOTE-balanced training
# set, using the same 5% chi-squared entry/exit penalty as the benchmark.
logit_full_SMOTE <- glm(Attrition_Flag ~ .,
                        family = binomial(link = "logit"),
                        data = balanced.data)
logit_stepwise_SMOTE <- stepAIC(logit_full_SMOTE,
                                k = qchisq(0.95, 1),
                                direction = "both",
                                trace = FALSE)
DT::datatable(tidy(logit_stepwise_SMOTE))

The SMOTE logistic regression selects 13 variables: besides the same 11 variables as the benchmark output, the SMOTE technique contributes 2 further variables, Customer_Age and Card_Category. Also, no multicollinearity has been reported.

# Generalized VIF for the SMOTE model; as with the benchmark, only the
# Total_Trans_Amt / Total_Trans_Ct pair shows elevated values (~2.1).
vif(logit_stepwise_SMOTE, merge_coef = TRUE)
##                     variable     gvif df gvif^(1/(2*df))
##  1:             Customer_Age 1.052147  1        1.025742
##  2:                   Gender 1.030179  1        1.014978
##  3:          Dependent_count 1.039721  1        1.019667
##  4:           Marital_Status 1.051664  3        1.008431
##  5:            Card_Category 1.075272  3        1.012169
##  6: Total_Relationship_Count 1.183490  1        1.087883
##  7:   Months_Inactive_12_mon 1.047881  1        1.023661
##  8:    Contacts_Count_12_mon 1.044314  1        1.021917
##  9:      Total_Revolving_Bal 1.065756  1        1.032355
## 10:     Total_Amt_Chng_Q4_Q1 1.126993  1        1.061599
## 11:          Total_Trans_Amt 4.475752  1        2.115597
## 12:           Total_Trans_Ct 4.550265  1        2.133135
## 13:      Total_Ct_Chng_Q4_Q1 1.140677  1        1.068025

WOE Transformation

IV is essentially a weighted sum of all the individual WoE values, where the weights incorporate the absolute difference between the numerator and the denominator (WoE captures the relative difference). Generally, a variable with an IV below 0.02 is considered unpredictive, 0.02–0.1 weak, 0.1–0.3 medium, and 0.3–0.5 strong; values above 0.5 are suspicious for over-predicting.

# Coarse-bin every predictor with 'Attrited Customer' as the bad/event class.
bins <- woebin(train_dat, y = "Attrition_Flag", count_distr_limit = 0.05,
               bin_num_limit = 10, positive = 'bad|Attrited Customer')

# Pull each variable's total information value by column NAME rather than by
# position (pluck(.x, 10, 1)), which would silently break if scorecard ever
# reordered its output columns, then label each variable by the usual IV
# rules of thumb.
iv <- map_df(bins, ~pluck(.x, "total_iv", 1)) %>%
  pivot_longer(everything(), names_to = "var", values_to = "iv") %>%
  mutate(group = case_when(iv < 0.02 ~ "Generally unpredictive",
                           iv < 0.1 ~ "Weak",
                           iv < 0.3 ~ "Medium",
                           iv <= 0.5 ~ "Strong",
                           TRUE ~ "Suspicious for overpredicting"))
# Horizontal bar chart of IV per variable, coloured by predictive strength.
iv %>%
  ggplot(aes(x = var, y = iv, fill = group)) +
  geom_col() +
  theme(text = element_text(size = 5)) +
  #scale_x_discrete(labels = abbreviate) +
  theme_minimal() +
  coord_flip()

Accordingly, Gender, Dependent_count, Education_Level, Marital_Status, Income_Category, Card_Category and Months_on_book are generally unpredictive. This result seems to conflict with the previous regression.

# Stack the per-variable binning tables into one data frame in a single call
# instead of growing it with rbind() inside a loop (which copies the whole
# accumulator on every iteration).
woe_transform_df <- do.call(rbind, bins)
# WoE trend per variable; bin labels are suppressed on the x-axis since only
# the shape (monotonic vs. not) matters here.
woe_transform_df %>%
  ggplot(aes(x = bin, y = woe, group = 1)) +
  geom_line() +
  facet_wrap(~ variable) +
  theme_minimal() +
  theme(axis.title.x = element_blank(),
        axis.text.x = element_blank(),
        axis.ticks.x = element_blank())

Above figure reports the trend in WoE of variables. As can be seen, there is a negative linear relationship between the odd of P(Y=1) and the WoE of Total_Amt_Chng_Q4_Q1, Total_Ct_Chng_Q4_Q1, Total_Relationship_Count, Total_Trans_Ct. Some several variables show non-linearity between odd of P(Y=1) and WoE, namely Avg_Utilization_Ratio, Credit_Limit, Total_Revolving_Bal, Total_Trans_Amt. We employ isotonic regression to attempt to implement the monotonic binning.

# Variables whose WoE profile was non-monotonic; re-derive their cut points
# with isotonic regression to force monotonic binning.
non_linear <- c('Avg_Utilization_Ratio', 'Credit_Limit', 'Total_Revolving_Bal', 'Total_Trans_Amt')
# NOTE(review): the cut points are computed from the FULL dataset (df), not
# train_dat, which leaks test-set information into the binning — consider
# switching to train_dat.
dt <- df %>%
  # all_of() — selecting with a bare external vector is deprecated in dplyr.
  dplyr::select(all_of(non_linear), Attrition_Flag) %>%
  mutate(Attrition_Flag = ifelse(Attrition_Flag == 'Attrited Customer', 1, 0)) %>%
  as.data.frame()
# One set of monotonic cut points per non-linear variable.
result <- list()
for (j in non_linear) {
  result[[j]] <- iso_bin(x = dt[, j], y = dt$Attrition_Flag)$cut
}
# Re-bin the training data using the isotonic break points and re-draw the
# WoE trend plot to verify monotonicity.
breaks_adj <- woebin(train_dat, y = "Attrition_Flag", breaks_list = result,
                     count_distr_limit = 0.05, bin_num_limit = 10,
                     positive = 'bad|Attrited Customer')
# Single-call stacking instead of growing the frame with rbind() in a loop.
woe_transform_df_new <- do.call(rbind, breaks_adj)
woe_transform_df_new %>%
  ggplot(aes(x = bin, y = woe, group = 1)) +
  geom_line() +
  facet_wrap(~ variable) +
  theme_minimal() +
  theme(axis.title.x = element_blank(),
        axis.text.x = element_blank(),
        axis.ticks.x = element_blank())

We next implement the WoE transformation for training set and run logistic regression.

# Recompute IV on the adjusted bins; total_iv is extracted by name rather
# than by positional index for robustness.
iv_new <- map_df(breaks_adj, ~pluck(.x, "total_iv", 1)) %>%
  pivot_longer(everything(), names_to = "var", values_to = "iv") %>%
  mutate(group = case_when(iv < 0.02 ~ "Generally unpredictive",
                           iv < 0.1 ~ "Weak",
                           iv < 0.3 ~ "Medium",
                           iv <= 0.5 ~ "Strong",
                           TRUE ~ "Suspicious for overpredicting"))

# Keep only variables with at least weak predictive power.
iv_new <- iv_new %>%
  filter(!group %in% c("Generally unpredictive"))

# any_of() replaces the superseded one_of(); both tolerate absent names, and
# here every name comes from train_dat's own bins, so all are present anyway.
train_dat_WOE <- train_dat %>%
  dplyr::select(any_of(unique(iv_new$var)), Attrition_Flag)
test_dat_WOE <- test_dat %>%
  dplyr::select(any_of(unique(iv_new$var)), Attrition_Flag)

# Replace each raw value with its bin's WoE.
train_woe_list <- woebin_ply(train_dat_WOE, breaks_adj, to = "woe")
test_woe_list <- woebin_ply(test_dat_WOE, breaks_adj, to = "woe")

# Stepwise logistic regression on the WoE-transformed training set, with the
# same 5% chi-squared penalty as the benchmark.
logit_full_WOE <- glm(Attrition_Flag ~ .,
                      family = binomial(link = "logit"),
                      data = train_woe_list)
logit_stepwise_WOE <- stepAIC(logit_full_WOE,
                              k = qchisq(0.95, 1),
                              direction = "both",
                              trace = FALSE)
DT::datatable(tidy(logit_stepwise_WOE))

The WOE transformation helps to reduce the AIC of the best stepwise logistic model to 3092.3, as opposed to 3339.2 for the benchmark. Interestingly, no categorical variables from the original training set remain, due to their generally unpredictive power under the IV rules. The multicollinearity check suggests Avg_Utilization_Ratio and Total_Revolving_Bal should be excluded from the model. Also, checking the significance of each individual variable in the new model, Customer_Age should be excluded because its p-value is greater than 0.05. Our final model is as follows.

# Final WoE model: refit after dropping the two collinear WoE columns and
# the insignificant Customer_Age term identified above.
final_woe_dat <- train_woe_list %>%
  dplyr::select(-Total_Revolving_Bal_woe, -Avg_Utilization_Ratio_woe, -Customer_Age_woe)
logit_WOE <- glm(Attrition_Flag ~ .,
                 family = binomial(link = "logit"),
                 data = final_woe_dat)
DT::datatable(tidy(logit_WOE))

PCA Transformation

Inheriting from EDA, we select 17 as the number of principal components. The model is as follows.

# PCA for mixed data: scaled numeric columns plus categorical columns (minus
# the target) go into PCAmix; 17 components were chosen in the EDA.
# across(everything(), ...) replaces the superseded mutate_all().
df.quanti <- train_dat %>%
  dplyr::select(-where(is.factor)) %>%
  mutate(across(everything(), as.numeric)) %>%
  as.data.frame()
df.quali <- train_dat %>%
  dplyr::select(where(is.factor), -Attrition_Flag) %>%
  as.data.frame()
pca <- PCAmix(X.quanti = scale(df.quanti), X.quali = df.quali,
              rename.level = TRUE, ndim = 17, graph = FALSE)
# Component scores become the design matrix for the stepwise logit.
temp <- as.data.frame(pca$ind$coord)
temp$Attrition_Flag <- train_dat$Attrition_Flag
logit_full_PCA <- glm(Attrition_Flag ~ .,
                      family = binomial(link = "logit"),
                      data = temp)
logit_stepwise_PCA <- stepAIC(logit_full_PCA,
                              k = qchisq(0.95, 1),
                              direction = "both",
                              trace = FALSE)
DT::datatable(tidy(logit_stepwise_PCA))

As can be seen, the final model contains only the 2nd, 4th, 5th, 9th, 12th, 13th, 14th, 16th, and 17th components. Interestingly, the first component, which accounts for most of the total variation of the dataset, is excluded; indeed, in the full model this component is insignificant at 5%. The VIF check shows there is no multicollinearity, as expected from the PCA technique.

Comparison

We compare the performance of several models based on the test set. Some highlights are:

# Score the untouched test set with each model (predicted probabilities).
test_dat$BENCHMARK <- predict(logit_stepwise, newdata = test_dat, type = "response")
test_dat$SMOTE <- predict(logit_stepwise_SMOTE, newdata = test_dat, type = "response")
test_dat$WOE <- predict(logit_WOE, newdata = test_woe_list, type = "response")
# Rebuild the PCAmix inputs from the test set, excluding the prediction
# columns just added. (The original computed df.quanti.test twice — the
# duplicate line has been removed.)
df.quanti.test <- test_dat %>%
  dplyr::select(-where(is.factor), -BENCHMARK, -SMOTE, -WOE) %>%
  mutate(across(everything(), as.numeric)) %>%
  as.data.frame()
df.quali.test <- test_dat %>%
  dplyr::select(where(is.factor), -Attrition_Flag) %>%
  as.data.frame()
test_temp <- as.data.frame(predict(pca, X.quanti = scale(df.quanti.test),
                                   X.quali = df.quali.test,
                                   rename.level = TRUE, graph = FALSE))
colnames(test_temp) <- paste('dim', 1:17)
test_dat$PCA <- predict(logit_stepwise_PCA, newdata = test_temp, type = "response")
# ROC curves with AUC for the four models, side by side.
par(mfrow = c(2, 2))
plot(roc(test_dat$Attrition_Flag, test_dat$BENCHMARK, direction = "<"),
     col = "steelblue", lwd = 3, main = "(a) Benchmark", print.auc = TRUE)
plot(roc(test_dat$Attrition_Flag, test_dat$SMOTE, direction = "<"),
     col = "steelblue", lwd = 3, main = "(b) SMOTE technique", print.auc = TRUE)
plot(roc(test_dat$Attrition_Flag, test_dat$WOE, direction = "<"),
     col = "steelblue", lwd = 3, main = "(c) WOE technique", print.auc = TRUE)
plot(roc(test_dat$Attrition_Flag, test_dat$PCA, direction = "<"),
     col = "steelblue", lwd = 3, main = "(d) PCA technique", print.auc = TRUE)

par(mfrow = c(1, 1))

# Recall (sensitivity, i.e. byClass[1] of confusionMatrix) at the
# ROC-optimal ("best") threshold, for a vector of predicted probabilities.
# Factored out of four copy-pasted blocks; transpose = TRUE replaces the
# unsafe T shorthand.
recall_at_best <- function(pred, truth) {
  roc_obj <- roc(truth, pred, direction = "<")
  thr <- coords(roc_obj, "best", "threshold", transpose = TRUE)[[1]]
  # Probabilities at/above the threshold map to the 'Existing Customer'
  # label, as in the original per-model blocks.
  pred_cls <- as.factor(ifelse(pred >= thr, "Existing Customer", "Attrited Customer"))
  confusionMatrix(pred_cls, truth)$byClass[1]
}

recall_bm <- recall_at_best(test_dat$BENCHMARK, test_dat$Attrition_Flag) # Sensitivity: 0.8750000
recall_SMOTE <- recall_at_best(test_dat$SMOTE, test_dat$Attrition_Flag)  # Sensitivity: 0.8750000
recall_WOE <- recall_at_best(test_dat$WOE, test_dat$Attrition_Flag)      # Sensitivity: 0.8647541
recall_PCA <- recall_at_best(test_dat$PCA, test_dat$Attrition_Flag)      # Sensitivity: 0.7315574

# Bar chart comparing recall across the four models.
rbind(recall_bm, recall_SMOTE, recall_WOE, recall_PCA) %>%
    as.data.frame() %>%
    mutate(Model = c("Benchmark", "SMOTE", "WOE", "PCA")) %>%
    ggplot() +
    geom_col(aes(x = Model, y = Sensitivity), fill = 'steelblue') +
    geom_text(aes(x = Model, y = Sensitivity, label = round(Sensitivity, 3)), vjust = -0.5) +
    theme_minimal() +
    ylab('Recall')